# -*-Python-*-

# Some improvements on t5.1.0.large:
#  - GEGLU-activated feedforward layers instead of RELU
#    see https://arxiv.org/abs/2002.05202
#    Note that d_ff is reduced to 2/3 of its previous value to compensate for
#    the extra projection.
#  - no dropout during training
#  - no sharing of embedding weights with classifier layer.
#
# Also note that (although it is not dictated by this config), the checkpoints
#   released for this config were trained on C4 only rather than a mix of tasks.

include 'models/t5.1.1.base.gin'

# Size settings for this config. They are assigned after the include above,
# presumably so that they replace the values defined in the included base
# config (TODO confirm against models/t5.1.1.base.gin).
d_model = 1024
num_layers = 24
# See the GEGLU note in the file header: d_ff is reduced to compensate for the
# extra projection in the gated feedforward layer.
d_ff = 2816
num_heads = 16
# Binds the `model_parallelism` parameter of `utils.tpu_mesh_shape`.
# NOTE(review): presumably the number of cores each model replica is split
# across -- confirm against the definition of `tpu_mesh_shape`.
utils.tpu_mesh_shape.model_parallelism = 2
